Linking CO and KO

housecleaning: pull out the pieces that can go somewhere else to get a clean version of the iPython notebook 4/12/2016 ; 4/15/2016

Cell 12 is the place to change the path for the transcriptomics data...too big for GitHub


In [2]:
#%reset

In [3]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.cluster import KMeans
import cPickle as cpk

import palettable as pal
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams['pdf.fonttype'] = 42

from Bio.KEGG.REST import *

%matplotlib inline

In [4]:
mtabFile = 'RImetabolites_isomers.2015.08.27.csv' #first column is RInumber

In [5]:
CO_fromMATLAB=pd.read_csv(mtabFile, index_col='RInumber')

In [6]:
#make a list of the unique CO numbers for the CreateHash_COtoKO.py. Export the list as CSV
td = CO_fromMATLAB.groupby('cNumber').count()
COnumbers = td.drop(list(td.columns.values),axis=1)
del td
COnumbers.to_csv('exportCOnumbers.csv',header=True)

Write a couple of functions to swap between CO and RInumbers


In [7]:
def findRInumber(dataIn,KEGGin):
    #find possible RI numbers for a given KEGG number. 
    dataOut = []
    for i,KEGG in enumerate(dataIn['KEGG']):
        if KEGG == KEGGin:
            t = dataIn.index[i]
            dataOut.append(t)
    return dataOut
##For example: this will give back one row, C18028 will be multiple
#m = findRInumber(forRelatedness,'C00078') 

def convertRItoCO(dataIn,RIin):
    #do the reverse, given an RInumber find the cNumber
    dataOut = dataIn.loc[RIin].loc['cNumber']
    return dataOut
##This will always be a single value
#m = convertRItoCO(forRelatedness,'RI2')

#slight change, no need to send in a comparison file if it always the same thing
def convertRItoCO2(RIin):
    #do the reverse, given an RInumber find the cNumber
    dataOut = CO_fromMATLAB.loc[RIin].loc['cNumber']
    return dataOut
##This will always be a single value, also uses CO_fromMATLAB as input

In [8]:
#This grabs the CO/KO links from the KEGG website. The actual code is in the 
#CreateHash_COtoKO.py that Harriet wrote. Note that since the exportCOnumbers.csv 
#file is a unique list of C number we essentially already have a lookup table 
#for all the metabolites of interest.

if os.path.isfile('exportCOnumbers.csv' + '.pickle'):
    #just read in the file
    WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
else:
    #need to make the file
    filename = "CreateHash_COtoKO.py"
    %run $filename exportCOnumbers.csv 
    #then read in the file
    WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))

In [9]:
def SplitCODict(WorkingFile):
    CO_withoutKO={}
    CO_withKO={}
    for CO in WorkingFile.keys():

        if WorkingFile[CO]['Related KO']==[]:
            CO_withoutKO[CO]=WorkingFile[CO]
        else:
            CO_withKO[CO]=WorkingFile[CO]
    return CO_withoutKO, CO_withKO

CO_withoutKO, CO_withKO=SplitCODict(WorkingFile)
print 'There are', len(CO_withKO), 'COs with an associated KO.', len(CO_withoutKO), 'are not associated with a KO.'


There are 404 COs with an associated KO. 1438 are not associated with a KO.

In [10]:
AllKO=[]
AllCO=[]
for key in CO_withKO:
    AllKO.append(CO_withKO[key]['Related KO'])
    AllCO.append(CO_withKO[key]['Related CO'])
AllKO=list(set([item for sublist in AllKO for item in sublist]))
AllCO=list(set([item for sublist in AllCO for item in sublist]))

In [11]:
#go through CO_RawData_all one row at a time (inefficient for sure, but I understand 
#what is happening), then make a new column in CO_RawData_all that is True/False
CO_fromMATLAB['inList'] = ""

for idx in range(0,len(CO_fromMATLAB)):
# for idx in range(0):
    fc = CO_fromMATLAB.ix[idx,'cNumber']
    if fc in AllCO:
        CO_fromMATLAB.ix[idx,'inList'] = True
    else:
        CO_fromMATLAB.ix[idx,'inList'] = False

In [12]:
#can't quite figure out how to do this in one step.
m = CO_fromMATLAB[CO_fromMATLAB['inList']==True]
CO_metadata_pruned = m.loc[:,['cNumber','ChargedMass','RT','ionMode']]

#this list of days is useful, so define it up front.
dayList = ['S1','S2','S3','S4','S5'] 
CO_RawData_pruned = m.loc[:,dayList]
del m

In [13]:
#Load PhytoKEGG Annotations
#from Harriet...doesn't work, complaining about header=False 
#AllPhytoKO_ann=pd.read_table('AllPhytoKegg_annotated.tab', header=False, delimiter='\t')

#try this instead (note double \\ at end)
pathToData = 'Z:\KLtemp\TranscriptomicsData_Feb2016\\'
AllPhytoKO_ann=pd.read_table((pathToData + 'AllPhytoKegg_annotated.tab'), delimiter='\t')

InsituCounts=pd.read_table((pathToData + 'AllInsitu_NoZero.tab'), index_col='gID')

In [14]:
#normalize to the library size
InsituTPM=InsituCounts.copy()
InsituTPM[['S1', 'S2', 'S3', 'S4', 'S5']]=(InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']]/InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']].sum())*10**6

#Add annotation information
InsituCounts=InsituCounts.join(AllPhytoKO_ann)
InsituTPM=InsituTPM.join(AllPhytoKO_ann)
InsituCounts=InsituCounts.dropna()
InsituTPM=InsituTPM.dropna()

KO_RawData=InsituTPM.groupby('kID').sum()

In [14]:
def NormalizeToMean(DF):
    DF_meanNorm=DF.copy()
    out=DF_meanNorm.copy()
    DF_meanNorm['mean']=DF.mean(axis=1)

    for i in out.columns:
        out[i]=DF_meanNorm[i]/DF_meanNorm['mean']
    DF_meanNorm=DF_meanNorm.T.drop('mean').T
    return out

def NormalizeToMax(DF):
    DF_meanNorm=DF.copy()
    out=DF_meanNorm.copy()
    DF_meanNorm['max']=DF.max(axis=1)
    for i in out.columns:
        out[i]=DF_meanNorm[i]/DF_meanNorm['max']
    DF_meanNorm=DF_meanNorm.T.drop('max').T
    return out

def NormalizeToMean_CV(DF):
    out=DF.copy()
    out['mean']=DF.mean(axis=1)
    out['SD']=DF.std(axis=1)
    
    out['CV']=out['SD']/out['mean']
    return out

In [15]:
if False: 
    #norm2Mean seems based (see plots in earlier notebooks)
    KO_Norm2Mean=NormalizeToMean(KO_RawData) 

    ##two versions of the CO data: (1) regular and (2) inverse
    CO_Norm2Mean_regular=NormalizeToMean(CO_RawData_pruned) 
    CO_Norm2Mean_inverse=NormalizeToMean(1/CO_RawData_pruned)

In [16]:
#use _finalOption variable names to make life easier
KO_finalOption = KO_RawData.loc[AllKO].dropna()
CO_final_regular = CO_RawData_pruned.dropna() #already 'limited' this before the normalization
#CO_final_inverse = CO_Norm2Mean_inverse.dropna() #already 'limited' this before the normalization

In [17]:
##Combine the CO and the KO data in preparation for K means clustering
Combined_KO_CO_regular=KO_finalOption.append(CO_final_regular.loc[:,(dayList)])
#Combined_KO_CO_inverse=KO_finalOption.append(CO_final_inverse.loc[:,(dayList)])

In [18]:
def kmeanCluster(data,nc):
    #kmeans=KMeans(n_clusters=nc)
    kmeans = KMeans(n_clusters = nc, max_iter = 1000, n_init = 50, init = 'random')
    kmeans.fit(data)
    newData=data.copy()
    newData['kmeans']=kmeans.labels_
    return newData
    
def PlotKmeans(KmeansPD, kSize=10, figSizeX=1, figSizeY=5, color='k'):
    KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color=color)
    fig,axs=plt.subplots(figSizeX, figSizeY)
    axs=[item for sublist in axs for item in sublist]
    fig.set_size_inches(9,12)
    for ax, y in zip(axs,range(kSize)):
        pltData=KmeansPD[KmeansPD.kmeans==y].T.drop('kmeans')
        pltData.plot(ax=ax, legend=False, grid=False, color=color)

Move forward with 'best' number of clusters --> use Determine_Optimal_Number_Kmeans_Groups.ipynb to determine this number


In [19]:
#setting # of clusters manually, also some good options with lower # of clusters I think
#this number will get used later when plotting up the BRITE categories and the Kmeans clusters
makeNclusters = 6

In [23]:
#do the K-means clustering with the final # of clusters
CcoClust_regular=kmeanCluster(Combined_KO_CO_regular, makeNclusters) 

#CcoClust_inverse=kmeanCluster(Combined_KO_CO_inverse, makeNclusters)

In [25]:
CcoClust_regular.head(5)


Out[25]:
S1 S2 S3 S4 S5 kmeans
K00509 1.371790 39.039095 1.098119 0.985132 1.474082 0
K00503 1.933999 3.847460 6.281711 3.674274 5.449637 0
K00500 7.511114 32.829680 11.984844 8.493431 26.555812 4
K00505 8.230742 6.491661 8.477948 4.579530 10.921608 0
K01101 24.433610 37.583299 54.882318 142.045315 52.262910 3

In [30]:
CcoClust_regular.kmeans.unique()


Out[30]:
array([0, 4, 3, 1, 5, 2], dtype=int64)

In [31]:
CcoClust_regular['kmeans'].value_counts()


Out[31]:
0    1938
4      84
3      35
1      21
2       4
5       2
Name: kmeans, dtype: int64

In [ ]:
#import matplotlib as mpl
def PlotKmeansCombined(KmeansPD, kSize=10, figSizeX=2, figSizeY=3, color='k'):
    KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
    fig,axs=plt.subplots(figSizeX, figSizeY)
    axs=[item for sublist in axs for item in sublist]
    fig.set_size_inches(15,9)
    i=KmeansPD.index
    i=list(i)
    Ks=re.compile('K.*')
    #Cs=re.compile('C.*')
    Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
    C = filter(Cs.search, i)  
    K = filter(Ks.search, i)  
    Ksplit=KmeansPD.loc[K]
    Csplit=KmeansPD.loc[C]
    for ax, y in zip(axs,range(kSize)):
        KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
        KData.plot(ax=ax, legend=False, grid=False, color='b')
        CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
        CData.plot(ax=ax, legend=False, grid=False, color='r')
        SumKC=len(KData.T)+len(CData.T)
        KPct=(len(KData.T))
        CPct=(len(CData.T))
        ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
        ax.set_ylim([0,5])
    return fig
    #fig.savefig(figureName)

In [34]:
#import matplotlib as mpl
def PlotKmeansCombined_2(KmeansPD, kSize=10, figSizeX=2, figSizeY=3, color='k'):
    KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
    fig,axs=plt.subplots(figSizeX, figSizeY)
    axs=[item for sublist in axs for item in sublist]
    fig.set_size_inches(15,9)
    i=KmeansPD.index
    i=list(i)
    Ks=re.compile('K.*')
    #Cs=re.compile('C.*')
    Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
    C = filter(Cs.search, i)  
    K = filter(Ks.search, i)  
    Ksplit=KmeansPD.loc[K]
    Csplit=KmeansPD.loc[C]
    for ax, y in zip(axs,range(kSize)):
        KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
        CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
        
        if CData.empty and KData.empty:
            continue
        
        elif CData.empty:
            KData.plot(ax=ax, legend=False, grid=False, color='r')
        
        elif KData.empty:
            CData.plot(ax=ax, legend=False, grid=False, color='k')
                   
      
        else:
            CData.plot(ax=ax, legend=False, grid=False, color='k')
            KData.plot(ax=ax, legend=False, grid=False, color='r')
            
        
#         KData.plot(ax=ax, legend=False, grid=False, color='b')
#         CData.plot(ax=ax, legend=False, grid=False, color='r')
        
        
        SumKC=len(KData.T)+len(CData.T)
        KPct=(len(KData.T))
        CPct=(len(CData.T))
        ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
        #ax.set_ylim([0,5])
    return fig
    #fig.savefig(figureName)

In [35]:
f = PlotKmeansCombined_2(CcoClust_regular,makeNclusters) 
f.savefig('Kmeans_regular_beforeMean.png')
del f



In [24]:
f = PlotKmeansCombined(CcoClust_inverse,makeNclusters) 
f.savefig('Kmeans_inverse.png')
del f



In [168]:
#need this cell to get the taxonomic information used in the species plots later on

#load in the species/group information
Group_Species=pd.read_table('GrpSpecies',delimiter=' ').T.drop(['MMETSP', 
                                                  'MMETSP.1']).T.drop_duplicates().set_index('SName')

InsituTPMGrped=InsituTPM.groupby(['kID','sgID']).sum().reset_index().set_index('sgID')

Group_Species=Group_Species.reset_index()
#get the specific group combos
Dia=Group_Species[Group_Species['Grp']=='Bacillariophyta']
Din=Group_Species[Group_Species['Grp']=='Dinophyta']
Oth=Group_Species[((Group_Species['Grp']!='Dinophyta')&
                  (Group_Species['Grp']!='Bacillariophyta'))]
#get the insitu counts
Insitu_TPM_DIA=InsituTPMGrped.loc[Dia['SName']].groupby('kID').sum()
Insitu_TPM_DIN=InsituTPMGrped.loc[Din['SName']].groupby('kID').sum()
Insitu_TPM_Oth=InsituTPMGrped.loc[Oth['SName'].dropna()].groupby('kID').sum()

D=set(Insitu_TPM_DIA.index)
N=set(Insitu_TPM_DIN.index)
O=set(Insitu_TPM_Oth.index)

In [26]:
fig, axs=plt.subplots(2,3)
fig.set_size_inches(9,6)
axs=axs.flatten()
for i in set(CcoClust_regular['kmeans']):
    ClusterK = CcoClust_regular[CcoClust_regular['kmeans']==i]
    ClusterKind=set([k for k in ClusterK.index if k.startswith('K')])

    Di=D.intersection(ClusterKind)
    Ni=N.intersection(ClusterKind)
    Oi=O.intersection(ClusterKind)

    Dsum=Insitu_TPM_DIA.loc[Di].sum()
    Nsum=Insitu_TPM_DIN.loc[Ni].sum()
    Osum=Insitu_TPM_Oth.loc[Di].sum()

    ax=axs[i]
    ax.stackplot(range(5), Dsum, Nsum, Osum, colors=
                 pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
    ax.set_xticks(range(5))
    ax.set_xticklabels([1,2,3,4,5])


C:\Anaconda\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The set_color_cycle attribute was deprecated in version 1.5. Use set_prop_cycle instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [27]:
fig, axs=plt.subplots(2,3)
fig.set_size_inches(9,6)
axs=axs.flatten()
for i in set(CcoClust_inverse['kmeans']):
    ClusterK = CcoClust_inverse[CcoClust_inverse['kmeans']==i]
    ClusterKind=set([k for k in ClusterK.index if k.startswith('K')])

    Di=D.intersection(ClusterKind)
    Ni=N.intersection(ClusterKind)
    Oi=O.intersection(ClusterKind)

    Dsum=Insitu_TPM_DIA.loc[Di].sum()
    Nsum=Insitu_TPM_DIN.loc[Ni].sum()
    Osum=Insitu_TPM_Oth.loc[Di].sum()

    ax=axs[i]
    ax.stackplot(range(5), Dsum, Nsum, Osum, colors=
                 pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
    ax.set_xticks(range(5))
    ax.set_xticklabels([1,2,3,4,5])


focus on things with common linked reactions...


In [28]:
#But...for the CheckRelatedness...do need to go back to the cNumber...
#for now, easiest to just make yet another matrix and put the cNumbers back in.
forRelatedness_regular = CcoClust_regular.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness_regular.insert(0,'KEGG',"") #add a column called 'KEGG'

forRelatedness_inverse = CcoClust_inverse.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness_inverse.insert(0,'KEGG',"") #add a column called 'KEGG'

In [29]:
for idx in range(0,len(forRelatedness_regular)):
    t = forRelatedness_regular.iloc[idx,:].name

    if t[0]=='R':
        #go find the matching cNumber in CO_RawData_all
        t2 = CO_fromMATLAB.loc[t,('cNumber')]
        forRelatedness_regular.ix[idx,('KEGG')] = t2
    elif t[0] == 'K':
        #just copy the K number over
        forRelatedness_regular.ix[idx,('KEGG')] = t
        
for idx in range(0,len(forRelatedness_inverse)):
    t = forRelatedness_inverse.iloc[idx,:].name

    if t[0]=='R':
        #go find the matching cNumber in CO_RawData_all
        t2 = CO_fromMATLAB.loc[t,('cNumber')]
        forRelatedness_inverse.ix[idx,('KEGG')] = t2
    elif t[0] == 'K':
        #just copy the K number over
        forRelatedness_inverse.ix[idx,('KEGG')] = t

In [30]:
def CheckRelatedness(inClust,nC):
    df=pd.DataFrame(columns=['Common Rxn','No linked Rxn'], index=range(nC))
    for n in range(nC):
        kClust=inClust[inClust.kmeans==n]
        #i=kClust.index
        i = kClust.KEGG #change the new column I created with Cnumbers and Knumbers
        i=list(i)
        Csearc=re.compile('C.*') #re is regular expression...perl-like 
        Cs = filter(Csearc.search, i)
        Ksearc=re.compile('K.*')
        Kis = filter(Ksearc.search, i)
        Kis=set(Kis)
        Ks=[]
        for c in Cs:
            if c in CO_withKO.keys():
                Ks.append(CO_withKO[c]['Related KO'])
        Ks=set([item for sublist in Ks for item in sublist])
        df.loc[n,'Common Rxn']=len(Kis.intersection(Ks))
        df.loc[n, 'No linked Rxn']=len(Kis)-len(Kis.intersection(Ks))
    df.plot(kind='bar', stacked=True, colormap=pal.colorbrewer.diverging.PRGn_5.get_mpl_colormap(), grid=False)

In [31]:
CheckRelatedness(forRelatedness_regular, makeNclusters)



In [32]:
CheckRelatedness(forRelatedness_inverse, makeNclusters)


Grab the information from KEGG for plotting and bean counting


In [33]:
allPathways = kegg_list("pathway").read()
len(allPathways.split('\n'))
#number here is the # of pathways at KEGG, up to 486 by 4/13/2016


Out[33]:
487

In [34]:
trimPath = []
current_section = None
for line in allPathways.rstrip().split("\n"):
    tp = line[8:13]
    trimPath.append('ko' + tp)
    
#have some cases where KEGG will send back a pathway, but the pathway itself is not searchable...seems to 
#be a KEGG bug, 'ko00351' was first, then realized there are many of these,
#did this list manually since I thought it would be short...
toDelete = ('ko00351', 'ko01010','ko01060',  'ko01061', 'ko01062', 'ko01063', 
            'ko01064', 'ko01065', 'ko01066', 'ko01070', 'ko07011', 'ko07012', 
            'ko07013', 'ko07014', 'ko07015', 'ko07016', 'ko07017', 'ko07018', 
            'ko07019', 'ko07020', 'ko07021', 'ko07023', 'ko07024', 'ko07025', 
            'ko07026', 'ko07027', 'ko07028', 'ko07029', 'ko07030', 'ko07031', 
            'ko07032', 'ko07033', 'ko07034', 'ko07035', 'ko07036', 'ko07037', 
            'ko07038', 'ko07039', 'ko07040', 'ko07041', 'ko07042', 'ko07043', 
            'ko07044', 'ko07045', 'ko07046', 'ko07047', 'ko07048', 'ko07049', 
            'ko07050', 'ko07051', 'ko07052', 'ko07053', 'ko07054', 'ko07055', 
            'ko07056', 'ko07057', 'ko07110', 'ko07112', 'ko07114', 'ko07117', 
            'ko07211', 'ko07212', 'ko07213', 'ko07214', 'ko07215', 'ko07216', 
            'ko07217', 'ko07218', 'ko07219', 'ko07220', 'ko07221', 'ko07222', 
            'ko07223', 'ko07224', 'ko07225', 'ko07226', 'ko07227', 'ko07228', 
            'ko07229', 'ko07230', 'ko07231', 'ko07232', 'ko07233', 'ko07234', 
            'ko07235', 'ko04933')

#probably a way to do this without the for loop, but this will work
for item in toDelete:
    trimPath.remove(item)

In [35]:
shortList = ['ko03010','ko03013','ko00140'] #for testing

In [36]:
import fxn_gatherDetails
##if I make a change, have to reload the function:
#reload(fxn_gatherDetails)

In [37]:
#usePathway = shortList #for testing...use complete list later
usePathway = trimPath
KOandCOwithKmeans = forRelatedness_regular #linked KO & CO, data, with K means information
useCO = CO_fromMATLAB #full list of CO data, need to color pathway, and lookup table
useKO = KO_Norm2Mean # full list of KO data, need to color pathway
dia = Insitu_TPM_DIA #diatom TPM, for plotting species
din = Insitu_TPM_DIN #dino TPM, for plotting species
oth = Insitu_TPM_Oth #other TPM, for plotting species
folder = 'plots_regular_Km'

In [38]:
gc_regular = fxn_gatherDetails.gatherDetails(makeNclusters,usePathway,KOandCOwithKmeans,folder,useCO,useKO,dia,din,oth)

In [39]:
#switch to inverse, repeat 
KOandCOwithKmeans = forRelatedness_inverse #linked KO & CO, data, with K means information
folder = 'plots_inverse_Km'

gc_inverse = fxn_gatherDetails.gatherDetails(makeNclusters,usePathway,KOandCOwithKmeans,folder,useCO,useKO,dia,din,oth)

In [40]:
#now...save all that so I don't have to do this everytime BUT be careful with re-assigning K means group numbers!
cpk.dump(gc_regular, open('gatherCounts_norm2mean_regular.2016.04.18.pickle', 'wb'))
cpk.dump(gc_inverse, open('gatherCounts_norm2mean_inverse.2016.04.18.pickle', 'wb'))

## this is the code to read in the file...can use this without having to go through the pain of rerunning gatherCounts
#gatherCounts_regular = cpk.load(open('gatherCounts_norm2mean_regular.2016.04.13.pickle','rb'))

In [41]:
def plotGenes_ratio(tg):
    #if this is something in the pathway, plot up the species for the K number
    if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_DIN.index.tolist()):
        DaDn=Insitu_TPM_DIA.loc[tg]/Insitu_TPM_DIN.loc[tg]

    if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
        DaOt=Insitu_TPM_DIA.loc[tg]/Insitu_TPM_Oth.loc[tg]

    if (tg in Insitu_TPM_DIN.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
        DnOt=Insitu_TPM_DIN.loc[tg]/Insitu_TPM_Oth.loc[tg]

    useColors = pal.colorbrewer.qualitative.Set1_3.hex_colors    
    ms = 200 #set the size of the dots
    fig,axs=plt.subplots(1,3)
    fig.set_size_inches(10,4)
    axs[0].scatter(range(5),DaDn,s = ms, color = useColors[0])
    axs[0].set_title('DIA:DIN')
    axs[0].set_xticks(range(5))
    axs[0].set_ylabel('Ratios')

    axs[1].scatter(range(5),DaOt,s = ms, color = useColors[1])
    axs[1].set_title('DIA:Oth')
    axs[1].set_xticks(range(5))

    axs[2].scatter(range(5),DnOt,s = ms, color = useColors[2])
    axs[2].set_title('DIN:Oth')
    axs[2].set_xticks(range(5))

    #fig.savefig(directorySpecies + '/' + tg + '_species.png',bbox_inches='tight')
    #plt.close()

In [42]:
#not sure this is helpful since it would be another set of figures to wade through, and the 
#same information is already in the species plots...leave here to get a sense of what we 
#are looking for though in considering the ratios of the various TPMs
r = plotGenes_ratio('K01914')



In [43]:
#make a function to calculate the ratios of the three TPM groups
#function expects a version of the forRelatedness_XXXX dataframe:
def calcTPMratios(dataIn):

    #do NOT pull the KEGG numbers bc that causes issues downstream with the plotting: confusion of numbers/text
    df2 = dataIn.loc[:,('kmeans')]
    
    df2 = df2.to_frame() #this gets around the error about working on a copy of a slice
    df2['inList'] = ''

    for idx in range(0,len(df2)):
        fc = df2.index.values[idx]

        if fc[0] == 'C':
            df2.ix[idx,'inList'] = False
        elif fc[0] == 'K':
            df2.ix[idx,'inList'] = True  

    df2 = df2[df2['inList']==True]
    df2 = df2.T.drop('inList').T #this somehow gets around the warning about making a change on a copy...?

    for tg,row in df2.iterrows():

        if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_DIN.index.tolist()):
            #use this to get the mean, ignoring any inf/NaN values
            tv = Insitu_TPM_DIA.loc[tg,:]/Insitu_TPM_DIN.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDaDn'] = t 
        else:
            df2.loc[tg,'meanDaDn'] = np.nan

        if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
            tv =  Insitu_TPM_DIA.loc[tg,:]/Insitu_TPM_Oth.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDaOt'] = t
        else:
            df2.loc[tg,'meanDaOt'] = np.nan

        if (tg in Insitu_TPM_DIN.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
            tv = Insitu_TPM_DIN.loc[tg,:]/Insitu_TPM_Oth.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDnOt'] = t
        else:
            df2.loc[tg,'meanDnOt'] = np.nan
            
    return df2

In [44]:
out_regular = calcTPMratios(forRelatedness_regular)
out_inverse = calcTPMratios(forRelatedness_inverse)
#save inverse for later, this is already complicated enough with the regular K means groups

In [45]:
#not fully convinced this is actually interesting...the ratios in the extremes can be the cases
#with small # if TPM in denominator/numerator driving to large numbers (ie. 300 / 0.05))
axs = out_regular.boxplot(column = 'meanDaDn',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDaDn' + ' regular')


Out[45]:
<matplotlib.text.Text at 0x95453278>

In [46]:
axs = out_regular.boxplot(column = 'meanDaOt',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDaOt' + ' regular')


Out[46]:
<matplotlib.text.Text at 0x96746550>

In [47]:
axs = out_regular.boxplot(column = 'meanDnOt',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDnOt' + ' regular')


Out[47]:
<matplotlib.text.Text at 0xbce789e8>

In [48]:
out_regular[out_regular.loc[:,'meanDaDn']>=1000]


Out[48]:
kmeans meanDaDn meanDaOt meanDnOt
K01914 3 8761.666667 879.138333 0.110476

In [49]:
out_regular.mean()


Out[49]:
kmeans       2.126677
meanDaDn    31.217918
meanDaOt    36.837100
meanDnOt    10.500923
dtype: float64

In [50]:
out_regular.groupby('kmeans').mean()


Out[50]:
meanDaDn meanDaOt meanDnOt
kmeans
0 42.025429 13.088527 4.428997
1 31.495108 93.007387 4.939637
2 11.093790 26.453004 10.039916
3 293.437380 88.722557 3.311839
4 1.573144 11.403837 39.140383
5 13.063875 220.244444 9.833333

In [51]:
#explore...plot one gene here rather than digging it up from the folders
kid='K01914'
allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
allK=allK.drop('kID')
allK=allK.astype('float')
allK

if kid in Insitu_TPM_DIA.index.tolist():
    Dk=Insitu_TPM_DIA.loc[kid]
else: 
    Dk = 0/allK

if kid in Insitu_TPM_DIN.index.tolist():
    Nk=Insitu_TPM_DIN.loc[kid]
else:
    Nk = 0/allK
    
if kid in Insitu_TPM_Oth.index.tolist():
    Ok=Insitu_TPM_Oth.loc[kid]
else:
    Ok = 0/allK
    
fig,ax=plt.subplots(1)
ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
plt.title(kid)
# ax.set_ylim(0,1)
#fig.savefig(kid +'_numbers_forTalk.pdf')

#there are very small values here for DIN and other, but can't see them on the figure


Out[51]:
<matplotlib.text.Text at 0xa42fc128>

In [52]:
#function expects a version of the forRelatedness_XXXX dataframe:
def averageTPM(dataIn):

    #do NOT pull the KEGG numbers bc that causes issues downstream with the plotting: confusion of numbers/text
    df2 = dataIn.loc[:,('kmeans')]
    
    df2 = df2.to_frame() #this gets around the error about working on a copy of a slice
    df2['inList'] = ''

    for idx in range(0,len(df2)):
        fc = df2.index.values[idx]
        if fc[0] == 'C':
            df2.ix[idx,'inList'] = False
        elif fc[0] == 'K':
            df2.ix[idx,'inList'] = True  

    df2 = df2[df2['inList']==True]
    df2 = df2.T.drop('inList').T #this somehow gets around the warning about making a change on a copy...?

    for tg,row in df2.iterrows():

        if (tg in Insitu_TPM_DIA.index.tolist()) :
            #use this to get the mean, ignoring any inf/NaN values
            tv = Insitu_TPM_DIA.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDia'] = t 
        else:
            df2.loc[tg,'meanDia'] = np.nan

        if (tg in Insitu_TPM_DIN.index.tolist()) :
            tv =  Insitu_TPM_DIN.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDin'] = t
        else:
            df2.loc[tg,'meanDin'] = np.nan

        if (tg in Insitu_TPM_Oth.index.tolist()):
            tv = Insitu_TPM_Oth.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanOth'] = t
        else:
            df2.loc[tg,'meanOth'] = np.nan
            
    return df2

In [53]:
testing = averageTPM(forRelatedness_regular)

In [54]:
testing.head(5)


Out[54]:
kmeans meanDia meanDin meanOth
K00509 5 8.773276 NaN 0.020368
K00503 2 1.600686 2.606638 0.030093
K00500 2 14.198886 3.270149 0.039838
K00505 2 4.244865 0.877827 2.639846
K01101 2 50.637024 8.752114 2.948806

In [55]:
#but...ask Harriet whether she thinks 'average TPM' is really a valid concept here
#also, remember the different K means group have different number of genes...so, the dip
#in 'other' in K means group 4 is only considering 8 genes.

In [56]:
use = 'meanDia'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')


Out[56]:
<matplotlib.text.Text at 0xa4477400>

In [57]:
use = 'meanDin'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')


Out[57]:
<matplotlib.text.Text at 0x931ab710>

In [58]:
use = 'meanOth'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')


Out[58]:
<matplotlib.text.Text at 0x1018515c0>

In [59]:
colLabels = list(gc_regular.columns.values)

In [60]:
colLabels


Out[60]:
['nCpds',
 'nGenes',
 'Km0_cpd',
 'Km0_gene',
 'Km1_cpd',
 'Km1_gene',
 'Km2_cpd',
 'Km2_gene',
 'Km3_cpd',
 'Km3_gene',
 'Km4_cpd',
 'Km4_gene',
 'Km5_cpd',
 'Km5_gene',
 'pathwayInfo',
 'pathwayGroup_A',
 'pathwayGroup_B',
 'pathwayGroup_C']

In [61]:
cpdLabels = colLabels[2:13:2] #odd Python syntax, from 2 to 13, in steps of 2)
cpdLabels


Out[61]:
['Km0_cpd', 'Km1_cpd', 'Km2_cpd', 'Km3_cpd', 'Km4_cpd', 'Km5_cpd']

In [62]:
geneLabels = colLabels[3:14:2] #odd Python syntax, from 2 to 13, in steps of 2)
geneLabels


Out[62]:
['Km0_gene', 'Km1_gene', 'Km2_gene', 'Km3_gene', 'Km4_gene', 'Km5_gene']

In [63]:
#I have put the bar graphs back in here since I think they are the simplest way to consider these data

In [64]:
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gc_regular[gc_regular.loc[:,'pathwayGroup_A']=='Metabolism']

In [65]:
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]

In [66]:
dataToPlot.head(5)


Out[66]:
nCpds nGenes Km0_cpd Km0_gene Km1_cpd Km1_gene Km2_cpd Km2_gene Km3_cpd Km3_gene Km4_cpd Km4_gene Km5_cpd Km5_gene pathwayInfo pathwayGroup_A pathwayGroup_B pathwayGroup_C
ko00010 31 96 1 0 0 1 0 14 0 1 0 0 3 0 Glycolysis / Gluconeogenesis Metabolism Carbohydrate metabolism Glycolysis / Gluconeogenesis
ko00020 20 57 0 2 0 2 3 16 0 0 0 1 2 0 Citrate cycle (TCA cycle) Metabolism Carbohydrate metabolism Citrate cycle (TCA cycle)
ko00030 35 76 0 0 0 0 0 2 0 1 0 2 2 0 Pentose phosphate pathway Metabolism Carbohydrate metabolism Pentose phosphate pathway
ko00040 55 63 0 0 0 0 1 7 0 1 0 0 1 0 Pentose and glucuronate interconversions Metabolism Carbohydrate metabolism Pentose and glucuronate interconversions
ko00051 53 98 0 1 4 0 6 11 0 0 0 0 7 0 Fructose and mannose metabolism Metabolism Carbohydrate metabolism Fructose and mannose metabolism

In [67]:
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
pGroup


Out[67]:
array(['Carbohydrate metabolism', 'Lipid metabolism',
       'Metabolism of cofactors and vitamins', 'Energy metabolism',
       'Amino acid metabolism', 'Nucleotide metabolism',
       'Biosynthesis of other secondary metabolites',
       'Metabolism of terpenoids and polyketides',
       'Xenobiotics biodegradation and metabolism',
       'Metabolism of other amino acids',
       'Glycan biosynthesis and metabolism', 'Global and overview maps'], dtype=object)

In [68]:
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]

pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = cpdLabels)

for i, group in s.groupby('pathwayGroup_B'):
    d2 = group.loc[:,cpdLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,cpdLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])

In [69]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors, figsize = (12,12))


Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x10174b860>

In [70]:
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = geneLabels)

for i, group in s.groupby('pathwayGroup_B'):
    d2 = group.loc[:,geneLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,geneLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])

In [71]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors, figsize = (12,12))


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0xcc9992b0>

In [72]:
#useColors = pal.colorbrewer.qualitative.Set3_11.hex_colors
useColors = pal.colorbrewer.diverging.RdYlBu_11.hex_colors
newDFmtab.T.plot(kind = 'barh',color = useColors, figsize = (12,12))


Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0xd0a9e668>

In [73]:
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]

pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = cpdLabels)

for i, group in s.groupby('pathwayGroup_C'):
    d2 = group.loc[:,cpdLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,cpdLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])

useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors,figsize = (12,192))


Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0xe3df1e80>

In [74]:
s = plotMetabolism[(plotMetabolism.loc[:,geneLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,geneLabels]

pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = geneLabels)

for i, group in s.groupby('pathwayGroup_C'):
    d2 = group.loc[:,geneLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,geneLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])

useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors,figsize = (12,192))


Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x26200a90>

Liz wants to think about ratio of genes at one time point versus another...I think this will be driven by the small numbers again


In [75]:
dtm = forRelatedness_regular.loc[:,'S2']/forRelatedness_regular.loc[:,'S3']
#how many cases do we have with genes/compounds 10x greater at S2?
r = dtm[dtm>10]
r.replace(np.inf,np.nan).dropna() #replace inf with NaN, then drop the NaN


Out[75]:
K00509     35.550889
K13481     10.873400
K01243     12.021647
K14264     40.887580
K00944     14.301025
K00790     17.838439
K09843     11.771332
K00454     12.696537
K17690     14.747486
RI67       44.783023
RI68       44.783023
RI182      63.015188
RI184      63.015188
RI198      63.015188
RI200      63.015188
RI257      26.824789
RI267      26.824789
RI369     142.486187
RI481      21.561016
RI483      21.561016
RI543     264.225210
RI544     264.225210
RI554     264.225210
RI556     264.225210
RI560     264.225210
RI573      11.433061
RI581      11.433061
RI650      54.058096
RI661      13.335896
RI662      13.335896
             ...    
RI4477     18.783764
RI4478     20.392295
RI4479     18.783764
RI4480     20.392295
RI4481     18.783764
RI4482     20.392295
RI4483     18.783764
RI4491     14.602895
RI4503     14.602895
RI4515     14.602895
RI4527     14.602895
RI4716     74.966762
RI4718     74.966762
RI4724     74.966762
RI4730     74.966762
RI4732     74.966762
RI4738     74.966762
RI4762     39.965927
RI4764     32.279442
RI4765     32.279442
RI4766     32.279442
RI4820     10.587128
RI4826     10.587128
RI4828     10.587128
RI4829     10.587128
RI4868     14.384067
RI4869     14.384067
RI4878     15.880987
RI4914     36.837767
RI4916     40.171429
dtype: float64

In [76]:
ax2 = dtm.plot.box()
ax2.set_yscale('log')


Consider diversity of other EC/genes (lots of examples follow)


In [77]:
def plotOneGene(kid):#explore...plot one gene here rather than digging it up from the folders
    #kid='K03183'
    allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
    allK=allK.drop('kID')
    allK=allK.astype('float')
    allK

    if kid in Insitu_TPM_DIA.index.tolist():
        Dk=Insitu_TPM_DIA.loc[kid]
    else: 
        Dk = 0/allK

    if kid in Insitu_TPM_DIN.index.tolist():
        Nk=Insitu_TPM_DIN.loc[kid]
    else:
        Nk = 0/allK

    if kid in Insitu_TPM_Oth.index.tolist():
        Ok=Insitu_TPM_Oth.loc[kid]
    else:
        Ok = 0/allK
        
    #try not to plot things that are all NaN (though this will allow 'zeros', and
    #definitely have cases with 0 and NaN for a given gene)    
    if not Dk.dropna().empty and not Nk.dropna().empty and not Ok.dropna().empty: #only plot with data!
        fig,ax=plt.subplots(1)
        ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
        ax.set_xticks(range(5))
        ax.set_xticklabels([1,2,3,4,5])
        fig.set_size_inches(2,2) #this is a property of the figure, not the axis...
        plt.title(kid)
        # ax.set_ylim(0,1)
        #fig.savefig(kid +'_numbers_forTalk.pdf')

In [78]:
#given a gene, go get all possible EC numbers, use them to search for other genes with that EC, and plot
#this will skip over genes that are not in the RI data
def plotMultipleEC(kid):
    getOne = kegg_list(kid).read()
    geneList = []

    for line in getOne.rstrip().split('\n'):
        r1 = line.find("[EC:")
        r2 = line.find("]")
        r = line[(r1+4):r2]
        r3 = r.split(' ')
        #now that I have the EC numbers...get the genes for each one
        for idx in r3:
            getK = kegg_link('ko',idx).read()
            for line3 in re.split('\n',getK):
                if line3 is not '':
                    ge = line3.find('ko')
                    t = line3[ge+3:]
                    geneList.append(t)
    geneList = set(geneList) #this will make the list of unique genes...can print that up

    for idx in geneList:
        plotOneGene(idx) #plotting function has already been defined above...

In [79]:
#in some cases, only get back the one gene we started with
plotMultipleEC('K01830')



In [80]:
##KL note for Harriet: this is an ADP-ribose pyrophosphate...in the upper left corner 
##of the purine pathway
plotMultipleEC('K13988')



In [81]:
#example of something high at S2, but another version of the gene is steady throughout
#lipid: arachidonic acid metabolism
plotMultipleEC('K00509')



In [82]:
plotMultipleEC('K17690')



In [148]:
plotMultipleEC('K12343')

In [ ]:


In [ ]:


In [ ]:


In [85]:
#leave bits below for the moment...

In [90]:
type(InsituTPMGrped)


Out[90]:
pandas.core.frame.DataFrame

In [92]:
InsituTPMGrped.head(5)


Out[92]:
kID S1 S2 S3 S4 S5
sgID
Aletam K00001 0.000000 0.000000 0.011808 0.079876 0.000000
Ampcof K00001 0.011244 0.044565 0.011808 0.000000 0.000000
Ampmas K00001 0.011244 0.014855 0.035423 0.053250 0.000000
Astgla K00001 0.000000 0.000000 0.011808 0.000000 0.000000
Attsep K00001 0.112442 0.133696 0.047231 0.026625 0.178677

In [94]:
dayList


Out[94]:
['S1', 'S2', 'S3', 'S4', 'S5']

In [99]:
InsituTPMGrped['mean'] = InsituTPMGrped[dayList].mean(axis=1)

In [100]:
InsituTPMGrped.head(5)


Out[100]:
kID S1 S2 S3 S4 S5 mean
sgID
Aletam K00001 0.000000 0.000000 0.011808 0.079876 0.000000 0.018337
Ampcof K00001 0.011244 0.044565 0.011808 0.000000 0.000000 0.013523
Ampmas K00001 0.011244 0.014855 0.035423 0.053250 0.000000 0.022955
Astgla K00001 0.000000 0.000000 0.011808 0.000000 0.000000 0.002362
Attsep K00001 0.112442 0.133696 0.047231 0.026625 0.178677 0.099734

In [101]:


In [104]:
InsituTPMGrped.shape


Out[104]:
(266871, 7)

In [ ]:


In [103]:
InsituTPMGrped['mean'].max()


Out[103]:
984.59611900226923

In [106]:
Insitu_TPM_DIA.shape


Out[106]:
(5846, 5)

In [107]:
Insitu_TPM_DIN.shape


Out[107]:
(6608, 5)

In [108]:
Insitu_TPM_Oth.shape


Out[108]:
(5214, 5)

In [ ]:


In [102]:
plt.hist(InsituTPMGrped['mean'])


Out[102]:
(array([  2.66783000e+05,   6.70000000e+01,   1.30000000e+01,
          6.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00]),
 array([  2.24883660e-03,   9.84616359e+01,   1.96921023e+02,
          2.95380410e+02,   3.93839797e+02,   4.92299184e+02,
          5.90758571e+02,   6.89217958e+02,   7.87677345e+02,
          8.86136732e+02,   9.84596119e+02]),
 <a list of 10 Patch objects>)

In [ ]:


In [115]:
Insitu_TPM_DIA['mean'] = Insitu_TPM_DIA[dayList].mean(axis=1)
Insitu_TPM_DIN['mean'] = Insitu_TPM_DIN[dayList].mean(axis=1)
Insitu_TPM_Oth['mean'] = Insitu_TPM_Oth[dayList].mean(axis=1)

In [110]:
Insitu_TPM_DIA.head(5)


Out[110]:
S1 S2 S3 S4 S5 mean
kID
K00001 2.844778 3.758330 3.931973 4.952283 6.611035 4.419680
K00002 2.878511 2.718476 1.735736 2.343015 2.211123 2.377372
K00003 27.053504 16.593101 3.790281 2.795643 3.015168 10.649539
K00004 0.067465 0.148551 0.177116 0.292877 0.268015 0.190805
K00005 0.123686 0.103985 0.259770 0.319502 0.312684 0.223926

In [120]:
fig= plt.hist(Insitu_TPM_DIA['mean'],bins = 100,histtype='stepfilled',color = 'r',label = 'DIA',log=True)



In [121]:
fig = plt.hist(Insitu_TPM_DIN['mean'],bins = 100,histtype='stepfilled',color = 'g',alpha = 0.25,label = 'DIN',log = True)



In [122]:
fig = plt.hist(Insitu_TPM_Oth['mean'],bins = 100,histtype='stepfilled',color = 'b',alpha = 0.75,label = 'Oth',log = True)



In [123]:
pruneDIA = Insitu_TPM_DIA.copy(deep=True)

In [125]:
pruneDIA.head(5)


Out[125]:
S1 S2 S3 S4 S5 mean
kID
K00001 2.844778 3.758330 3.931973 4.952283 6.611035 4.419680
K00002 2.878511 2.718476 1.735736 2.343015 2.211123 2.377372
K00003 27.053504 16.593101 3.790281 2.795643 3.015168 10.649539
K00004 0.067465 0.148551 0.177116 0.292877 0.268015 0.190805
K00005 0.123686 0.103985 0.259770 0.319502 0.312684 0.223926

In [127]:
pruneDIA.drop('mean',axis=1,inplace=True)

In [129]:
k = pruneDIA<2

In [131]:
pruneDIA[k] = np.nan

In [137]:
pruneDIA['mean'] = pruneDIA[dayList].mean(axis=1)

In [140]:
#need dropna or else the histogram will barf
fig = plt.hist(pruneDIA['mean'].dropna(),log = True)



In [142]:
pruneDIN = Insitu_TPM_DIN[dayList].mean(axis=1)

In [143]:
fig = plt.hist(pruneDIN.dropna(),log=True)



In [157]:
fig = plt.hist(Insitu_TPM_Oth[dayList].mean(axis=1).dropna(),log=True)



In [ ]:


In [ ]:


In [171]:
def plotOneGene_pruned(kid):#explore...plot one gene here rather than digging it up from the folders
    #kid='K03183'
    allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
    allK=allK.drop('kID')
    allK=allK.astype('float')
    allK

    k = Insitu_TPM_DIA < 2
    Insitu_TPM_DIA[k] = 0
    
    k = Insitu_TPM_DIN < 2
    Insitu_TPM_DIN[k] = 0
    
    k = Insitu_TPM_Oth < 2
    Insitu_TPM_Oth[k] = 0
    
    if kid in Insitu_TPM_DIA.index.tolist():
        Dk=Insitu_TPM_DIA.loc[kid]
    else: 
        Dk = 0/allK

    if kid in Insitu_TPM_DIN.index.tolist():
        Nk=Insitu_TPM_DIN.loc[kid]
    else:
        Nk = 0/allK

    if kid in Insitu_TPM_Oth.index.tolist():
        Ok=Insitu_TPM_Oth.loc[kid]
    else:
        Ok = 0/allK
        
    #try not to plot things that are all NaN (though this will allow 'zeros', and
    #definitely have cases with 0 and NaN for a given gene)    
    if not Dk.dropna().empty and not Nk.dropna().empty and not Ok.dropna().empty: #only plot with data!
        fig,ax=plt.subplots(1)
        ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
        ax.set_xticks(range(5))
        ax.set_xticklabels([1,2,3,4,5])
        fig.set_size_inches(2,2) #this is a property of the figure, not the axis...
        plt.title(kid)
        # ax.set_ylim(0,1)
        #fig.savefig(kid +'_numbers_forTalk.pdf')
        
#given a gene, go get all possible EC numbers, use them to search for other genes with that EC, and plot
#this will skip over genes that are not in the RI data
def plotMultipleEC_2(kid):
    getOne = kegg_list(kid).read()
    geneList = []

    for line in getOne.rstrip().split('\n'):
        r1 = line.find("[EC:")
        r2 = line.find("]")
        r = line[(r1+4):r2]
        r3 = r.split(' ')
        #now that I have the EC numbers...get the genes for each one
        for idx in r3:
            getK = kegg_link('ko',idx).read()
            for line3 in re.split('\n',getK):
                if line3 is not '':
                    ge = line3.find('ko')
                    t = line3[ge+3:]
                    geneList.append(t)
    geneList = set(geneList) #this will make the list of unique genes...can print that up

    for idx in geneList:
        plotOneGene_pruned(idx) #plotting function has already been defined above...

In [172]:
plotMultipleEC_2('K13988')



In [166]:
Dk


Out[166]:
S1      2.878511
S2      2.718476
S3           NaN
S4      2.343015
S5      2.211123
mean    2.377372
Name: K00002, dtype: float64

In [ ]:


In [ ]: